### Specify which sample are we using ###

# CHOOSE sample to use
sample_choice <- "10_percent_sample"

if (sample_choice == "full") {
  filename <- "data/derived/clean_payroll_provider_firm_cz.rds"
} else if (sample_choice == "10_percent_sample") {
  filename = "data/derived/clean_payroll_provider_firm_cz_10_percent_sample.rds"
} else if (sample_choice == "1_percent_sample") {
  filename = "data/derived/clean_payroll_provider_firm_cz_1_percent_sample.rds"
}

#CHOOSE whether mild balanced panel is 3x pre post or 1x pre post (only one can be TRUE or both need to be FALSE -- latter is unbalanced panel case)
#Specifically: define_mild_1t == T imposes that each firm has to appear at least 1 time both in the pre and post periods
#------------- define_mild_3t == T imposes that each firm has to appear at least 3 times both in the pre and post periods
define_mild_1t = FALSE
define_mild_3t = TRUE

#CHOOSE: restriction to only include firms that have at least K employees throughout its panel
define_threshold_mild = 5


#-------------------------------------------------------------------------------------------------------------------#
# Load datasets
events_df <- read.csv("data/credit_bureau/events.csv") %>% 
  mutate(mdate=ymd(paste0(archive,"01")))

wage <- readRDS(filename) 

### Define functions needed ###

### OUTPUTTING EMPTY DATAFRAME!!! 

result_df_1t <- data.frame(event_id = character(), employment_lost_1t = numeric(), stringsAsFactors = FALSE)

result_df_3t <- data.frame(event_id = character(), employment_lost_3t = numeric(), stringsAsFactors = FALSE)

# Function to create balanced panel indicators
create_balanced_indicators <- function(df, 
                                       emp_threshold, 
                                       event_id, 
                                       keep_balanced_short=FALSE,
                                       keep_balanced_long=FALSE,                                       
                                       mild_emp_threshold = 0, # number of employees firms must have at least once in the panel (default is zero)
                                       mild_balanced_1t = FALSE, #firm must show up at least 1 time in the pre-event AND post-event periods (FALSE by default)
                                       mild_balanced_3t = FALSE) #firm must show up at least 3 times in the pre-event AND post-event periods (FALSE by default)
{
  
  # Step 1: Create temporary variables for the previous 12 months
  for (num in 1:12) {
    temp_var <- paste0("e_m", num, "_temp")
    final_var <- paste0("e_m", num)
    
    df <- df %>%
      mutate(!!temp_var := (etime == -num & tot_emp > emp_threshold)) %>%
      group_by(clt_client, czone) %>%
      mutate(!!final_var := max(!!sym(temp_var), na.rm = TRUE)) %>%
      ungroup() %>%
      select(-!!sym(temp_var))
    
  }
  
  
  # Step 3: Create temporary variables for the next 12 months
  for (num in 0:11) {
    temp_var <- paste0("e_", num, "_temp")
    final_var <- paste0("e_", num)
    
    df <- df %>%
      mutate(!!temp_var := (etime == num & tot_emp >= emp_threshold)) %>%
      group_by(clt_client, czone) %>%
      mutate(!!final_var := max(!!sym(temp_var), na.rm = TRUE)) %>%
      ungroup() %>%
      select(-!!sym(temp_var))
  }
  
  
  # also add restriction flag for a firm having at least "mild_emp_threshold" employees once throughout the panel
  df <- df %>% 
    group_by(clt_client, czone) %>%
    mutate(at_least_emp_threshold_once = (max(tot_emp) >= mild_emp_threshold)) %>%
    ungroup()
  
  # Step 4: Create the balanced_long and balanced_short indicators
  df <- df %>%
    mutate(balanced_long = pmin(!!!syms(paste0("e_m", 12:1)), !!!syms(paste0("e_", 0:11)), na.rm = TRUE)) %>%
    mutate(balanced_short = pmin(!!!syms(paste0("e_m", 6:1)), !!!syms(paste0("e_", 0:5)), na.rm = TRUE),
           #: create flag indicator for firms in the mild-balanced panel (before and after the event)
           balanced_mild_pre_event = rowSums(across(e_m12 : e_m1)), #sums number of times firm shows up in the pre-period
           balanced_mild_post_event = rowSums(across(e_0 : e_11)),   #sums number of times firm shows up in the post-period
           balanced_mild_1t = (at_least_emp_threshold_once == 1 & 
                                 balanced_mild_pre_event >= 1 & balanced_mild_post_event >= 1),
           balanced_mild_3t = (at_least_emp_threshold_once == 1 & 
                                 balanced_mild_pre_event >= 3 & balanced_mild_post_event >= 3)
    )
  
  # Step 5 : Inspect what percentage of firms and employment we will lose by restricting ourselves to a balanced panel
  
  for(event_id in event_id) {
    df <- df 
    inspect_balanced_panel(df,event_id)
  }
  
  
  # Step 6: Filter the dataset to keep only observations with balanced_short == 1
  df_final <- df %>%
    select(-starts_with("e_")) %>%
    select(clt_client, czone,month, mdate,trt_exp,
           etime,starts_with("balanced"), mw,share_affected,tot_emp, clt_industry_code_3d,
           starts_with("month_"),everything()) %>%
    arrange(clt_client, etime)
  
  # Step 7: keep only firms in the panel if indicated
  if (keep_balanced_short) {
    df_final <- df_final %>% filter(balanced_short == 1)
  }
  if (keep_balanced_long) {
    df_final <- df_final %>% filter(balanced_long == 1)
  } 
  
  if (mild_balanced_1t) {
    df_final <- df_final %>% filter(balanced_mild_1t == 1)
  }
  if (mild_balanced_3t) {
    df_final <- df_final %>% filter(balanced_mild_3t == 1)
  }
  
  # Step 8: Make sure that firms have one observation pre and post treatment, drop otherwise
  df_final <- df_final %>%
    group_by(clt_client, czone, trt_exp) %>%
    mutate(pre_post = ifelse(min(etime) < 0 & max(etime) > 0,1,0)) %>% # pre_post = 1 if firm has observation before and after the treatment
    ungroup() %>%
    filter(pre_post==1) %>%
    select(-pre_post)
  
  # -- Step 9: Winsorize wage variable "avg_wage_exact" at the P1 and P99 (for each event, since the function is ran by event)
  df_final <- df_final %>%
    mutate(avg_wage_exact_wins_1_99 = DescTools::Winsorize(avg_wage_exact, probs = c(.01, .99)),
           ln_avg_wage_exact_wins_1_99 = log(avg_wage_exact_wins_1_99))
  
  return(df_final)
}

# Function to inspect the impact of dropping firms with balanced_short == 0 and balanced_long == 0
inspect_balanced_panel <- function(df, event_id) {
  

  # Total number of firms and total employment
  total_firms <- df %>% 
    summarise(total_firms = n_distinct(clt_client)) %>%
    pull(total_firms)
  
  total_employment <- df %>% 
    summarise(total_employment = sum(tot_emp, na.rm = TRUE)) %>%
    pull(total_employment)
  
  # For balanced_short
  firms_lost_short <- df %>%
    filter(balanced_short == 0) %>%
    summarise(firms_lost = n_distinct(clt_client)) %>%
    pull(firms_lost)
  
  employment_lost_short <- df %>%
    filter(balanced_short == 0) %>%
    summarise(employment_lost = sum(tot_emp, na.rm = TRUE)) %>%
    pull(employment_lost)
  
  # For balanced_long
  firms_lost_long <- df %>%
    filter(balanced_long == 0) %>%
    summarise(firms_lost = n_distinct(clt_client)) %>%
    pull(firms_lost)
  
  employment_lost_long <- df %>%
    filter(balanced_long == 0) %>%
    summarise(employment_lost = sum(tot_emp, na.rm = TRUE)) %>%
    pull(employment_lost)
  
  #For mild balanced 1 t
  firms_lost_mild_1t = df %>%
    filter(balanced_mild_1t == 0) %>%
    summarise(firms_lost = n_distinct(clt_client)) %>%
    pull(firms_lost)
  
  employment_covered_mild_1t <- df %>%
    filter(balanced_mild_1t == 1) %>%
    summarise(employment_covered = sum(tot_emp, na.rm = TRUE)) %>%
    pull(employment_covered)
  
  #For mild balanced 3 t
  firms_lost_mild_3t = df %>%
    filter(balanced_mild_3t == 0) %>%
    summarise(firms_lost = n_distinct(clt_client)) %>%
    pull(firms_lost)
  
  employment_covered_mild_3t <- df %>%
    filter(balanced_mild_3t == 1) %>%
    summarise(employment_covered = sum(tot_emp, na.rm = TRUE)) %>%
    pull(employment_covered)
  
  #result_df_1t

  
  # Calculate shares
  share_firms_lost_short <- firms_lost_short / total_firms * 100
  share_employment_lost_short <- employment_lost_short / total_employment * 100
  
  share_firms_lost_long <- firms_lost_long / total_firms * 100
  share_employment_lost_long <- employment_lost_long / total_employment * 100
  

  share_employment_covered_1t <- employment_covered_mild_1t / total_employment * 100
  share_employment_covered_3t <- employment_covered_mild_3t / total_employment * 100
  
  result_df_1t <<- rbind(result_df_1t, data.frame(event_id = event_id, employment_covered_1t = share_employment_covered_1t))
  
  result_df_3t <<- rbind(result_df_3t, data.frame(event_id = event_id, employment_covered_3t = share_employment_covered_3t))
  
  
  # # Print the results
  # cat("Balanced Short Panel:\n")
  # cat(sprintf("Share of firms lost: %.2f%%\n", share_firms_lost_short))
  # cat(sprintf("Share of total employment lost: %.2f%%\n\n", share_employment_lost_short))
  # 
  # cat("Balanced Long Panel:\n")
  # cat(sprintf("Share of firms lost: %.2f%%\n", share_firms_lost_long))
  # cat(sprintf("Share of total employment lost: %.2f%%\n", share_employment_lost_long))
  # cat("\n")
  # 
  # #
  # cat("Mild-balanced 1t panel:\n")
  # cat(sprintf("Share of firms lost: %.2f%%\n", share_firms_lost_mild_1t))
  # cat(sprintf("Share of total employment lost: %.2f%%\n", share_employment_lost_1t))
  # cat("\n")
  # 
  # cat("Mild-balanced 3t panel:\n")
  # cat(sprintf("Share of firms lost: %.2f%%\n", share_firms_lost_mild_3t))
  # cat(sprintf("Share of total employment lost: %.2f%%\n", share_employment_lost_3t))
  # cat("\n")
  # 
  
  
}

# Function to generate variables and filter single-CZ firms
filter_single_cz_firms <- function(df) {
  
  # Step 1: Calculate total employment by CZ (tot_emp_CZ)
  df <- df %>%
    group_by(czone) %>%
    mutate(tot_emp_CZ = sum(tot_emp, na.rm = TRUE)) %>%
    ungroup()
  
  # Step 2: Calculate total employment per firm per CZ (tot_estab_CZ)
  df <- df %>%
    group_by(clt_client, czone) %>%
    mutate(tot_estab_CZ = sum(tot_emp, na.rm = TRUE)) %>%
    ungroup()
  
  # Step 3: Calculate the number of firms per CZ (n_firms_CZ)
  df <- df %>%
    group_by(czone) %>%
    mutate(n_firms_CZ = n_distinct(clt_client)) %>%
    ungroup()
  
  # Step 4: Calculate the number of CZs per firm (n_CZs_firm)
  df <- df %>%
    group_by(clt_client) %>%
    mutate(n_CZs_firm = n_distinct(czone)) %>%
    ungroup()
  
  # Step 5: Confirm the share of single-CZ firms
  single_CZ_firms <- df %>%
    group_by(clt_client) %>%
    summarise(n_CZs = n_distinct(czone)) %>%
    summarise(single_CZ_share = mean(n_CZs == 1))
  
  cat(sprintf("Share of single-CZ firms: %.2f%%\n", single_CZ_firms$single_CZ_share * 100))
  
  # Step 6: Drop multi-CZ firms
  df_single_CZ <- df %>%
    filter(n_CZs_firm == 1)
  
  # Return the filtered dataset
  return(df_single_CZ)
}

# Function to create same_ind_policy and sample_ind variables
create_industry_flags <- function(df) {
  
  # Define the industry codes for each variable
  policy_codes <- c(452, 452, 722, 452, 454) # NAICS codes for policy firms
  sample_codes <- c(561, 452, 722, 445, 448, 492, 454, 444, 451)
  
  # Create the same_ind_policy variable
  df <- df %>%
    mutate(same_ind_policy = clt_industry_code_3d %in% policy_codes)
  
  # Create the sample_ind variable
  df <- df %>%
    mutate(sample_ind = clt_industry_code_3d %in% sample_codes)
  
  # Change order of variables
  df <- df %>%
    select(clt_client, czone,month, mdate,trt_exp, 
           etime,starts_with("balanced"), mw,share_affected,tot_emp, 
           clt_industry_code_3d,same_ind_policy, sample_ind,
           ends_with("_CZ"),starts_with("month_"),everything()) %>%
    arrange(clt_client, etime)
  
  
  return(df)
}

create_wage_shares <- function(df) {
  
  # extract minimum wage value, we use first value because mw is the same for the event
  min_wage <- df$mw[1] 
  
  df <- df %>%
    mutate(share_wage_ltmw = !!sym(paste0("share_wage_lt_",min_wage)),
           share_wage_mw = !!sym(paste0("wage_",min_wage))/tot_emp,
           share_wage_gtmw = !!sym(paste0("share_wage_gt_",min_wage))
    )
  
  return(df)
}

# Function to calculate the gap and placebo gap
calculate_gap <- function(df,start_gap = -6, end_gap = -3, placebo_start_gap = -12, placebo_end_gap = -9) {
  
  # extract minimum wage value, we use first value because mw is the same for the event
  mw <- df$mw[1] 
  
  bmw <- mw - 1 # bin before minimum wage
  
  # Step 1: Calculate gap for wage bins below the minimum wage
  for (i in 7:bmw) {
    if (i == 7) {
      wage_var <- "wage_lt8"
    } else {
      wage_var <- paste0("wage_", i)
    }
    gap_var <- paste0("gap_", mw - i)
    df <- df %>%
      # total payment that would be necessary for the company to pay their workers as much as the policy's minimum wage
      mutate(!!sym(gap_var) := !!sym(wage_var) * (mw - i))  
  }
  
  # Step 2: For wage bins at or above the minimum wage, set the gap to 0
  for (i in mw:29) {
    gap_var <- paste0("gap_", i)
    df <- df %>%
      mutate(!!sym(gap_var) := 0)
  }
  
  # Step 3: Calculate total wage bill per bin (denominator)
  for (i in 8:29) {
    if (i == 7) {
      wage_var <- "wage_lt8"
    } else {
      wage_var <- paste0("wage_", i)
    }
    sumwage_var <- paste0("sumwage_", i)
    df <- df %>%
      mutate(!!sym(sumwage_var) := !!sym(wage_var)* i)
  }
  
  # Step 4: Calculate the gap numerator and denominator
  df <- df %>%
    rowwise() %>%
    mutate(sum_gap = sum(c_across(starts_with("gap_")), na.rm = TRUE), # numerator
           sum_wage = sum(c_across(starts_with("sumwage_")), na.rm = TRUE)) %>% #denominator
    ungroup()
  
  # Step 5: Calculate the gap variable
  df <- df %>%
    mutate(gap = sum_gap / sum_wage,
           gap = if_else(!between(etime, start_gap, end_gap), NA_real_, gap))
  
  # Step 6: Calculate the average gap by commuting zone and firm for the specified period
  df <- df %>%
    group_by(czone, clt_client) %>%
    mutate(np_gap = mean(gap, na.rm = TRUE)) %>%
    ungroup()
  
  # Step 7: Repeat for placebo gap (using the placebo window)
  df <- df %>%
    mutate(placebo_gap = sum_gap / sum_wage,
           placebo_gap = if_else(!between(etime, placebo_start_gap, placebo_end_gap), NA_real_, placebo_gap)) %>%
    group_by(czone, clt_client) %>%
    mutate(np_placebo_gap = mean(placebo_gap, na.rm = TRUE)) %>%
    ungroup()
  
  # Step 8: Eliminate variables that we are not going to use anymore
  df <- df %>%
    select(-starts_with("gap_"),-starts_with("sumwage"))
  
  # Return the modified data frame
  return(df)
}

keep_months_within_year_of_event <- function(df) {
  
  # Restrict data to within a year of event
  event_start = -12
  event_end = 11
  df <- df %>%
    mutate(
      month_date = ymd(paste0(month,"01")),
      #add -1 so that first month after policy is 0
      etime = interval(month_date,mdate)/months(1) - 1) %>% 
    filter(etime >= -12 & etime <= 11) %>%
    select(clt_client, czone,mdate,trt_exp, 
           etime,mw,share_affected,tot_emp, clt_industry_code_3d,
           starts_with("month"),everything()) %>%
    arrange(clt_client, etime)
  
  return(df)
  
}

merge_treatment_variables <- function(df, event_id) {
  
  # Merge data with the treatment variables dataset for the corresponding event
  filename <- paste0("data/derived/equifax/all_gap_measures_", event_id,"_qtrly_wage_bill.dta")
  treatment_df <- read_dta(filename) %>%
    rename(czone=cz) %>%
    select(czone, T, starts_with("wage_bill"),starts_with("employment_share"),
           starts_with("super"),avgplacebogap)
  
  df <- df %>% inner_join(treatment_df, by = "czone") #%>%

  
}

label_variables <- function(df) {
  
  # Apply labels individually to each variable
  var_label(df$clt_client) <- "Non-policy company code"
  var_label(df$trt_exp) <- "experiment code"
  var_label(df$mw) <- "new policy minimum wage"
  var_label(df$balanced_short) <- "fully balanced -6 to 5 months from policy month"
  var_label(df$balanced_long) <- "fully balanced -12 to 11 months from policy month"
  var_label(df$etime) <- "months since event"
  var_label(df$share_wage_ltmw) <- "share with wages less than minimum wage"
  var_label(df$share_wage_mw) <- "share with wage in minimum wage bin"
  var_label(df$share_wage_gtmw) <- "share with wage at least minimum wage"
  var_label(df$T) <- "average gap"
  var_label(df$share_affected) <- "share affected by policy"
  var_label(df$months_since_last_policy) <- "months since last policy at company"
  var_label(df$months_until_next_policy) <- "months until next policy at company"
  
  
  return(df)
  
}

save_results_csv <- function() {
  write.csv(result_df_1t, "figures_tables/result_df_1t.csv", row.names = FALSE)
  write.csv(result_df_3t, "figures_tables/result_df_3t.csv", row.names = FALSE)
}


create_experiment_dataset <- function(df, event_id, events_df) {
  
  # Extract useful values from events database
  df$mw <- events_df$mw[events_df$eventid == event_id]
  df$mdate <- events_df$mdate[events_df$eventid == event_id]
  df$share_affected <- events_df$share_affected[events_df$eventid == event_id]
  df$months_since_last_policy <- events_df$months_since_last_policy[events_df$eventid == event_id]
  df$months_until_next_policy <- events_df$months_until_next_policy[events_df$eventid == event_id]
  df$trt_exp <- event_id
  
  # Do the data cleaning necessary to prepare data for the regressions
  cat("\n")
  cat(sprintf("Processing event: %d.\n", event_id))
  cat("\n")
  
  df <- keep_months_within_year_of_event(df)
  
  df <- create_balanced_indicators(df, emp_threshold = 0, event_id=event_id, 
                                   mild_emp_threshold = define_threshold_mild, 
                                   mild_balanced_1t = define_mild_1t,
                                   mild_balanced_3t = define_mild_3t
  ) 

  
    df <- filter_single_cz_firms(df)
   # 
    df <- create_industry_flags(df)
   # 
     df <- create_wage_shares(df)
   #  
     df <- calculate_gap(df)
   #  
     df <- merge_treatment_variables(df, event_id)
   #  
     df <- label_variables(df)
  
  return(df)
  
}

### Create stacked dataset ###

events_to_exclude <- c(4,15,16)
events_id <- setdiff(1:23,events_to_exclude)
stacked_dataset <- map_dfr(events_id, ~create_experiment_dataset(wage,.x,events_df))

save_results_csv()

colnames(result_df_3t) <- c("Event ID", "Employment Covered (%)")  

latex_table <- xtable(result_df_3t, caption = "Percent of Employment Coverd by Mildly Balanced Firms by VMW Event")
align(latex_table) <- c("1", rep("c", ncol(result_df_3t)))

print(
  latex_table, 
  file = "figures_tables/tablee3_e3_pct_emp_bal_firm.tex", 
  include.rownames = FALSE, 
  sanitize.text.function = identity, 
  caption.placement = "top", 
  booktabs = TRUE
)

print(latex_table, 
      include.rownames = FALSE, 
      santitize.text.function = identity, 
      booktabs= TRUE)


#Create suffix for file name depending on the requirement for the mild-balanced panel
suffix_mild_balanced = ifelse(define_mild_1t == T, 
                              "pre_post_1t",
                              ifelse(define_mild_3t == T,
                                     "pre_post_3_t",
                                     "unbalanced"))

saveRDS(stacked_dataset,paste0("data/derived/mild_balanced_", suffix_mild_balanced, "_stacked_nonpolicy_firm_payroll_provider_dataset_", sample_choice, ".rds"))





